library(haven)
library(foreign)
library(readxl)
library(openxlsx)
library(car)
library(dplyr)
library(readstata13)


########## Import survey data ##########
d <- read.dta13("~/PhD/Survey Data for Opinion_Policy Dataset/DONE TNS Gallup AS, 2008, september/MMA1088_F1.dta", encoding="UTF-8")
names(d)

######################################################################
#--------------------------  VARIABLES ------------------------------# 
######################################################################

# INSERT row number of issue in excell docuemnt (next number after previous entry)
issue_row_num = 1

# INSERT a policy item variable (Values = FAV, OPP, DK) and the name of that variable 
# Recode according to rules specified in separate document
d$polpref <- car::recode(d$NAME, " ' '='FAV'; ' '='OPP'; ' '='DK'")
polpref_var_name <- "NAME"

# INSERT Gender variable (1=MALE, 2=FEMALE)
d$gender <- car::recode(d$kjoenn, "'Mann'='MALE'; 'Kvinne'='FEMALE'; else=NA")
table(d$gender)

# INSERT Age group variable (categorical, X number of groups, ordered youngest to oldest (1,2,...X), watch out for DK)
# d$age <- factor(d$alder_oevre, labels=c("1","2","3","4","5","6","7","8","9","10","11","12"))
# table(d$age)

# INSERT Numeric AGE variable. Turns it into 10 equal-N categories
d$age <- d$alderkat2
d %>% 
  group_by(age = ntile(age, 10)) %>% # split variable into k groups
  mutate(age_cats = paste0(min(age), "-", max(age))) %>% # create the ranges
  ungroup() -> d
table(d$age)

# INSERT Eduation group variable (categorical, X number of groups, ordered lowest to highest (1,2,...X), watch out for DK) 
d$edu <- ifelse(d$utdann_nivaa3=="Ubesvart", NA, d$utdann_nivaa3)
d$edu <- factor(d$edu, labels=c("1","2","3"))
table(d$edu)

# INSERT income group varible (categorical, X number of groups, ordered lowest to highest (1,2,...X), watch out for DK)
# d$income <- factor(d$respinnt_oevre, labels=c("1","2","3","4","5","6","7","8","9"))
# table(d$income)

d$income <- d$respinnt
d %>% 
  group_by(income = ntile(income, 10)) %>% # split variable into k groups
  mutate(income_cats = paste0(min(income), "-", max(income))) %>% # create the ranges
  ungroup() -> d
table(d$income)

# INSERT Numeric HOUSHOLD Income variable. Turns it into 10 equal-N categories
d$hincome <- d$faminnt
d %>% 
  group_by(hincome = ntile(hincome, 10)) %>% # split variable into k groups
  mutate(hincome_cats = paste0(min(hincome), "-", max(hincome))) %>% # create the ranges
  ungroup() -> d
table(d$hincome)


# INSERT Region variable

# OR 
# INSERT Fylke variable, NB: HEDMARK, ?ST-AGDER
d$fylke <- car::recode(d$fylke, "'Aust-Agder'='?st-Agder'; 'Hedemark'='Hedmark'")
table(d$fylke)

# INSERT Party variable (What will they vote if election tomorrow) (Cateogrical, no order, but use same abbreviatons as in master data)
d$party <- car::recode(d$stemte_stortingsvalg05, "'Arbeiderpartiet'='AP'; 'Det norske Arbeiderparti'='AP'; 'H?yre'='H'; 'AKP (m-l)'='AKP'; 'AKP (Arbeidernes Kommunistparti (m-l)'='AKP'; 'Kommunistene (NKP)'='NKP'; 'Kristelig Folkeparti'='KRF'; 'Senterpartiet'='SP'; 'Sosialistisk Folkeparti'='SF'; 'Sosialistisk Folkeparti eller Sosialistisk Valgforbund'='SF'; 
                       'Sosialistisk Venstreparti'='SV'; 'Venstre'='V'; 'Det nye Folkeparti'='DNF'; 'Det nye Folkepartiet'='DNF'; 'Anders Langes Parti'='ALP'; 'Fremskrittspartiet'='FRP'; 'Kystpartiet'='KY'; 'R?d Valgallianse'='RV'; 'Hadde ikke stemmerett'='NOVOTE'; 'Stemte ikke'='NOVOTE'; else=NA")  
table(d$party)


######################################################################
#---------------------------- START ---------------------------------# 
######################################################################

# Age distribution 
age_dist <- as.data.frame(table(d$age, d$polpref))
age_dist$name <- paste("AGE", age_dist$Var1, sep ="")
age_dist$name <- paste(age_dist$name, age_dist$Var2, sep = "_")
age_dist <- age_dist %>% select(name, Freq)

# Education distribution
edu_dist <- as.data.frame(table(d$edu, d$polpref))
edu_dist$name <- paste("EDU", edu_dist$Var1, sep ="")
edu_dist$name <- paste(edu_dist$name, edu_dist$Var2, sep = "_")
edu_dist <- edu_dist %>% select(name, Freq)

# Income INDIVIDUAL distribution
inc_dist <- as.data.frame(table(d$income, d$polpref))
inc_dist$name <- paste("INC", inc_dist$Var1, sep ="")
inc_dist$name <- paste(inc_dist$name, inc_dist$Var2, sep = "_")
inc_dist <- inc_dist %>% select(name, Freq)

# Income HOUSHOLD distribution
hinc_dist <- as.data.frame(table(d$hincome, d$polpref))
hinc_dist$name <- paste("HINC", hinc_dist$Var1, sep ="")
hinc_dist$name <- paste(hinc_dist$name, hinc_dist$Var2, sep = "_")
hinc_dist <- hinc_dist %>% select(name, Freq)

# Party distribution
p_dist <- as.data.frame(table(d$party, d$polpref))
p_dist$name <- paste("P", p_dist$Var1, p_dist$Var2, sep ="_")
p_dist <- p_dist %>% select(name, Freq)

# Fylke distribution
fy_dist <- as.data.frame(table(d$fylke, d$polpref))
fy_dist$name <- paste("FY", fy_dist$Var1, fy_dist$Var2, sep ="_")
fy_dist <- fy_dist %>% select(name, Freq)

# Gender
g_dist <- as.data.frame(table(d$gender, d$polpref))
g_dist$name <- paste(g_dist$Var1, g_dist$Var2, sep ="_")
g_dist <- g_dist %>% select(name, Freq)

# Overall
o_dist <- as.data.frame(table(d$polpref))
o_dist$name <- paste("OVERALL", o_dist$Var1, sep ="_")
o_dist <- o_dist %>% select(name, Freq)

###### Single variable distributions: 
# Single Age distribution 
age_dist_s <- as.data.frame(table(d$age))
age_dist_s$name <- paste("S_AGE", age_dist_s$Var1, sep ="")
age_dist_s <- age_dist_s %>% select(name, Freq)

# Single Education distribution
edu_dist_s <- as.data.frame(table(d$edu))
edu_dist_s$name <- paste("S_EDU", edu_dist_s$Var1, sep ="")
edu_dist_s <- edu_dist_s %>% select(name, Freq)

# Single Income INDIVIDUAL distribution
inc_dist_s <- as.data.frame(table(d$income))
inc_dist_s$name <- paste("S_INC", inc_dist_s$Var1, sep ="")
inc_dist_s <- inc_dist_s %>% select(name, Freq)

# Single Income HOUSHOLD distribution
hinc_dist_s <- as.data.frame(table(d$hincome))
hinc_dist_s$name <- paste("S_HINC", hinc_dist_s$Var1, sep ="")
hinc_dist_s <- hinc_dist_s %>% select(name, Freq)

# Rbind the data frames
m1 <- rbind(age_dist, edu_dist, inc_dist, hinc_dist, p_dist, age_dist_s, edu_dist_s, hinc_dist_s, inc_dist_s, fy_dist, o_dist, g_dist) 
m2 <- data.frame(t(m1))
colnames(m2) <- as.character(unlist(m2[1,]))
m2 = m2[-1, ]
m2$ISSUE <- issue_row_num
m2 <- mutate_all(m2, function(x) as.numeric(as.character(x)))

m2$ORG_VARNAME <- polpref_var_name
m2$ORG_DATASET_N <- nrow(d)

# Rbind with master data 
master_data <- read_excel("~/PhD/Dataset/PhD Dataset okt 3.xlsx")
rbind.all.columns <- function(x, y) {
  x.diff <- setdiff(colnames(x), colnames(y))
  y.diff <- setdiff(colnames(y), colnames(x))
  
  x[, c(as.character(y.diff))] <- NA
  
  y[, c(as.character(x.diff))] <- NA
  
  return(rbind(x, y))
}
master_data <- rbind.all.columns(master_data, m2)
master_data <- master_data %>% arrange(ISSUE)
write.xlsx(master_data, '~/PhD/Dataset/PhD Dataset okt 3.xlsx')

table(d$polpref,d$fylke)
table(d$polpref,d$age)
table(d$polpref,d$gender)
table(d$polpref,d$income)
table(d$polpref,d$edu)

######################################################################
#----------------------------- END ----------------------------------# 
######################################################################